In [ ]:
%%HTML
<script src="require.js"></script>
In [ ]:
# Library cell
import pandas as pd
import geopandas as gpd
import geoplot
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
import plotly.express as px
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
import plotly.io as pio
pio.renderers.default='notebook'
In [ ]:
# Function cell
# Function cell
## Find non-numeric values
def find_non_numeric_values(df):
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    non_numeric_values = {}
    for col in non_numeric_columns:
        # Change the column to numeric type, if it isn't numeric, it will be converted to NaN
        temp_col = pd.to_numeric(df[col], errors='coerce')      
        # Fill the NaN values with the original values
        non_numeric_data = df[temp_col.isna() & df[col].notna()]   
        if not non_numeric_data.empty:
            non_numeric_values[col] = non_numeric_data[col].tolist()  
    return non_numeric_values

## Remove non-numeric values
def remove_commas_and_convert(df):
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    for col in non_numeric_columns:
        # Check if the column contains any non-numeric values
        try:
           # Remove commas from the column
            temp_col = df[col] = df[col].str.replace(',', '')
            temp_col_numeric = pd.to_numeric(temp_col, errors='raise')  
            # If the column can be converted to numeric, replace the original column with the new column
            df[col] = temp_col_numeric
        except ValueError:
            # If the column contains non-numeric values, keep it
            continue
    return df

LOAD DATA¶

In [ ]:
data = gpd.read_file(r'D:\Repo-train\Jnotebook\FDI_Analytics\geo\diaphantinhenglish.geojson')
df = pd.read_csv(r'D:\Repo-train\Jnotebook\FDI_Analytics\dataset\fdi_provinces_en.csv')
print(type(data))
<class 'geopandas.geodataframe.GeoDataFrame'>
In [ ]:
df.head()
Out[ ]:
Order Provinces Number of new projects Newly registered capital (million USD) Adjusted project number Adjusted capital (million USD) Number of times of capital contribution to buy shares Value of capital contribution, share purchase\n(million USD) Year
0 1 TP. Ho Chi Minh 836 1006.69 222 619.07 1935 1802.56 2016
1 2 Hai Phong 52 2464.32 38 429.24 27 96.34 2016
2 3 Ha Noi 453 1922.76 159 504.47 228 367.21 2016
3 4 Binh Duong 256 1630.52 130 641.97 25 94.72 2016
4 5 Dong Nai 91 1043.74 136 921.05 55 273.44 2016
In [ ]:
df.tail()
Out[ ]:
Order Provinces Number of new projects Newly registered capital (million USD) Adjusted project number Adjusted capital (million USD) Number of times of capital contribution to buy shares Value of capital contribution, share purchase\n(million USD) Year
436 437 Ha Giang NaN NaN NaN NaN NaN NaN 2022
437 438 Lai Chau NaN NaN NaN NaN NaN NaN 2022
438 439 Lao Cai NaN NaN NaN NaN NaN NaN 2022
439 440 Quang Binh NaN NaN NaN NaN NaN NaN 2022
440 441 Son La NaN NaN NaN NaN NaN NaN 2022

Analyze the data¶

In [ ]:
# Drop column Order
n_df = df.drop(columns=['Order'])
# Show shape data
print(n_df.shape, end='\n ---------------- \n')
# Show info data
print(n_df.info(), end='\n ---------------- \n')
# Check for Duplicate
print(n_df.nunique(), end='\n ---------------- \n')
# Check data exist nan or not (bool)
print(n_df.isnull().any(), end='\n ---------------- \n')
# Check for missing value
print(n_df.isna().sum(), end='\n ---------------- \n')
(441, 8)
 ---------------- 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 8 columns):
 #   Column                                                       Non-Null Count  Dtype 
---  ------                                                       --------------  ----- 
 0   Provinces                                                    441 non-null    object
 1   Number of new projects                                       386 non-null    object
 2   Newly registered capital (million USD)                       387 non-null    object
 3   Adjusted project number                                      344 non-null    object
 4   Adjusted capital (million USD)                               344 non-null    object
 5   Number of times of capital contribution to buy shares        378 non-null    object
 6   Value of capital contribution, share purchase
(million USD)  377 non-null    object
 7   Year                                                         441 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 27.7+ KB
None
 ---------------- 
Provinces                                                        63
Number of new projects                                          100
Newly registered capital (million USD)                          351
Adjusted project number                                          76
Adjusted capital (million USD)                                  282
Number of times of capital contribution to buy shares            99
Value of capital contribution, share purchase\n(million USD)    330
Year                                                              7
dtype: int64
 ---------------- 
Provinces                                                       False
Number of new projects                                           True
Newly registered capital (million USD)                           True
Adjusted project number                                          True
Adjusted capital (million USD)                                   True
Number of times of capital contribution to buy shares            True
Value of capital contribution, share purchase\n(million USD)     True
Year                                                            False
dtype: bool
 ---------------- 
Provinces                                                        0
Number of new projects                                          55
Newly registered capital (million USD)                          54
Adjusted project number                                         97
Adjusted capital (million USD)                                  97
Number of times of capital contribution to buy shares           63
Value of capital contribution, share purchase\n(million USD)    64
Year                                                             0
dtype: int64
 ---------------- 

Observations¶

  • The shape of dataset fdi_provinces_en.csv is 441 rows and 8 columns
  • Only Year column dftype int, so we will convert some columns to numeric for consistency to calculate and explore the data.
  • Check all columns to get boolean values indicating if missing values exist and determine which columns have missing values
  • Only Year column not exist missing value.

Data Cleaning¶

Step-by-step¶

  1. Get all "not numeric" from all columns with func find_non_numeric_values()
  2. Format numeric with func remove_commas_and_convert()
  3. Remove special character
  4. Fill all NaN to 0
  5. Drop Province and Year column for consistency data to numeric
  6. Re-execute find_non_numeric_values() to check result
  7. Random select rows to print for review
In [ ]:
## Check for not numeric value
non_numeric_dict = find_non_numeric_values(n_df)
if non_numeric_dict:
    for col, values in non_numeric_dict.items():
        print(f"Column '{col}' have values not numeric:")
        print(values)
else:
    print("No non-numeric values found.")
Column 'Provinces' have values not numeric:
['TP. Ho Chi Minh', 'Hai Phong', 'Ha Noi', 'Binh Duong', 'Dong Nai', 'Bac Giang', 'Bac Ninh', 'Long An', 'Ha Nam', 'Tay Ninh', 'Phu Yen', 'Quang Ninh', 'Ba Ria - Vung Tau', 'Hai Duong', 'Tien Giang', 'Hung Yen', 'Ha Tinh', 'Vinh Phuc', 'Nam Dinh', 'Tra Vinh', 'Can Tho', 'Thanh Hoa', 'Phu Tho', 'Thai Nguyen', 'Vinh Long', 'Quang Nam', 'Binh Phuoc', 'Da Nang', 'Ninh Binh', 'Ninh Thuan', 'Binh Dinh', 'Nghe An', 'Hau Giang', 'Khanh Hoa', 'Thai Binh', 'Tuyen Quang', 'Quang Binh', 'Lam Dong', 'Ben Tre', 'Ca Mau', 'Thua Thien Hue', 'Lao Cai', 'Quang Ngai', 'Dong Thap', 'Ha Giang', 'An Giang', 'Hoa Binh', 'Lang Son', 'Binh Thuan', 'Kon Tum', 'Soc Trang', 'Kien Giang', 'Quang Tri', 'Yen Bai', 'Dak Lak', 'Dak Nong', 'Gia Lai', 'Bac Kan', 'Bac Lieu', 'Dien Bien', 'Cao Bang', 'Lai Chau', 'Son La', 'TP. Ho Chi Minh', 'Bac Ninh', 'Thanh Hoa', 'Binh Duong', 'Khanh Hoa', 'Ha Noi', 'Nam Dinh', 'Dong Nai', 'Kien Giang', 'Tay Ninh', 'Hai Phong', 'Bac Giang', 'Ba Ria - Vung Tau', 'Hung Yen', 'Binh Phuoc', 'Long An', 'Quang Ngai', 'Hai Duong', 'Ninh Thuan', 'Ha Nam', 'Yen Bai', 'Ben Tre', 'Ninh Binh', 'Phu Tho', 'Quang Binh', 'Vinh Phuc', 'Binh Dinh', 'Tien Giang', 'Tra Vinh', 'Da Nang', 'Quang Nam', 'Vinh Long', 'Nghe An', 'Thai Nguyen', 'Thai Binh', 'Ha Tinh', 'Dong Thap', 'Lam Dong', 'Dak Lak', 'Quang Ninh', 'Hoa Binh', 'Binh Thuan', 'Can Tho', 'Dak Nong', 'Ca Mau', 'Soc Trang', 'Lao Cai', 'Son La', 'Cao Bang', 'An Giang', 'Thua Thien Hue', 'Dien Bien', 'Ha Giang', 'Quang Tri', 'Lang Son', 'Tuyen Quang', 'Phu Yen', 'Kon Tum', 'Hau Giang', 'Bac Lieu', 'Bac Kan', 'Gia Lai', 'Lai Chau', 'Ha Noi', 'TP. Ho Chi Minh', 'Hai Phong', 'Binh Duong', 'Ba Ria - Vung Tau', 'Dong Nai', 'Thua Thien Hue', 'Bac Ninh', 'Tay Ninh', 'Long An', 'Hai Duong', 'Bac Giang', 'Binh Phuoc', 'Hung Yen', 'Quang Nam', 'Thai Nguyen', 'Ha Nam', 'Ninh Thuan', 'Quang Ninh', 'Ben Tre', 'Vinh Phuc', 'Bac Lieu', 'Quang Ngai', 'Thanh Hoa', 'Kien Giang', 'Da Nang', 'Nam Dinh', 'Tien Giang', 'Hoa Binh', 'Ninh Binh', 'Vinh Long', 'Phu Tho', 'Binh Dinh', 'Tra Vinh', 'Ha Tinh', 'Khanh Hoa', 'Soc Trang', 'Thai Binh', 'Dak Nong', 'Ca Mau', 'Can Tho', 'Quang Binh', 'Dak Lak', 'Tuyen Quang', 'Nghe An', 'Binh Thuan', 'Phu Yen', 'Lang Son', 'Kon Tum', 'Lam Dong', 'Yen Bai', 'Dong Thap', 'Hau Giang', 'An Giang', 'Son La', 'Lao Cai', 'Quang Tri', 'Ha Giang', 'Bac Kan', 'Cao Bang', 'Dien Bien', 'Lai Chau', 'Gia Lai', 'Ha Noi', 'TP. Ho Chi Minh', 'Binh Duong', 'Dong Nai', 'Bac Ninh', 'Hai Phong', 'Tay Ninh', 'Bac Giang', 'Ba Ria - Vung Tau', 'Ha Nam', 'Long An', 'Hai Duong', 'Thai Nguyen', 'Vinh Phuc', 'Da Nang', 'Hung Yen', 'Binh Phuoc', 'Tien Giang', 'Thanh Hoa', 'Phu Tho', 'Thua Thien Hue', 'Nghe An', 'Quang Ninh', 'Phu Yen', 'Khanh Hoa', 'Quang Nam', 'Binh Thuan', 'Vinh Long', 'Ninh Binh', 'Quang Ngai', 'Ninh Thuan', 'Bac Lieu', 'Soc Trang', 'Tra Vinh', 'Binh Dinh', 'Ca Mau', 'Hau Giang', 'Can Tho', 'Thai Binh', 'An Giang', 'Nam Dinh', 'Ben Tre', 'Ha Tinh', 'Lam Dong', 'Kien Giang', 'Quang Tri', 'Tuyen Quang', 'Dong Thap', 'Yen Bai', 'Kon Tum', 'Bac Kan', 'Dak Lak', 'Lang Son', 'Dien Bien', 'Lao Cai', 'Quang Binh', 'Ha Giang', 'Son La', 'Dak Nong', 'Cao Bang', 'Lai Chau', 'Hoa Binh', 'Gia Lai', 'TP. Ho Chi Minh', 'Bac Lieu', 'Ha Noi', 'Ba Ria - Vung Tau', 'Binh Duong', 'Hai Phong', 'Dong Nai', 'Bac Ninh', 'Bac Giang', 'Long An', 'Ha Nam', 'Ben Tre', 'Tay Ninh', 'Vinh Phuc', 'Hai Duong', 'Quang Ninh', 'Hung Yen', 'Thai Nguyen', 'Binh Phuoc', 'Thanh Hoa', 'Phu Tho', 'Quang Binh', 'Vinh Long', 'Da Nang', 'Nghe An', 'Tien Giang', 'Dak Nong', 'Quang Ngai', 'Tra Vinh', 'Ninh Binh', 'Binh Thuan', 'Nam Dinh', 'Thai Binh', 'Khanh Hoa', 'Soc Trang', 'Binh Dinh', 'Can Tho', 'Thua Thien Hue', 'Ca Mau', 'Hoa Binh', 'Quang Tri', 'Lam Dong', 'Dong Thap', 'Ha Tinh', 'Kien Giang', 'Tuyen Quang', 'Gia Lai', 'Yen Bai', 'Lao Cai', 'An Giang', 'Hau Giang', 'Dak Lak', 'Son La', 'Phu Yen', 'Kon Tum', 'Dien Bien', 'Cao Bang', 'Quang Nam', 'Ninh Thuan', 'Bac Kan', 'Ha Giang', 'Lai Chau', 'Lao Cai', 'Hai Phong', 'Long An', 'TP. Ho Chi Minh', 'Binh Duong', 'Bac Ninh', 'Ha Noi', 'Dong Nai', 'Can Tho', 'Bac Giang', 'Quang Ninh', 'Tay Ninh', 'Vinh Phuc', 'Hung Yen', 'Phu Tho', 'Thai Binh', 'Dak Lak', 'Binh Phuoc', 'Ba Ria - Vung Tau', 'Hai Duong', 'Nghe An', 'Ha Nam', 'Thai Nguyen', 'Thanh Hoa', 'Thua Thien Hue', 'Da Nang', 'Ninh Binh', 'Hau Giang', 'Kon Tum', 'Nam Dinh', 'Tien Giang', 'Binh Dinh', 'Quang Tri', 'Ninh Thuan', 'Quang Binh', 'Yen Bai', 'Vinh Long', 'Dong Thap', 'Quang Ngai', 'Quang Nam', 'Khanh Hoa', 'Ca Mau', 'Binh Thuan', 'Lam Dong', 'Tra Vinh', 'An Giang', 'Ha Tinh', 'Lao Cai', 'Dak Nong', 'Kien Giang', 'Lang Son', 'Gia Lai', 'Phu Yen', 'Cao Bang', 'Lai Chau', 'Soc Trang', 'Bac Lieu', 'Hoa Binh', 'Tuyen Quang', 'Ben Tre', 'Bac Kan', 'Dien Bien', 'Ha Giang', 'Son La', 'TP. Ho Chi Minh', 'Binh Duong', 'Quang Ninh', 'Bac Ninh', 'Hai Phong', 'Ha Noi', 'Thai Nguyen', 'Dong Nai', 'Bac Giang', 'Ba Ria - Vung Tau', 'Nghe An', 'Long An', 'Hung Yen', 'Phu Tho', 'Tay Ninh', 'Ha Nam', 'Hai Duong', 'Thai Binh', 'Ha Tinh', 'Vinh Phuc', 'Binh Phuoc', 'Tien Giang', 'Thua Thien Hue', 'Can Tho', 'Vinh Long', 'Da Nang', 'Soc Trang', 'Thanh Hoa', 'Ninh Thuan', 'Quang Ngai', 'Quang Nam', 'Ninh Binh', 'Nam Dinh', 'Binh Dinh', 'Binh Thuan', 'An Giang', 'Dak Lak', 'Yen Bai', 'Khanh Hoa', 'Kien Giang', 'Tra Vinh', 'Ben Tre', 'Phu Yen', 'Lang Son', 'Hoa Binh', 'Tuyen Quang', 'Dak Nong', 'Kon Tum', 'Quang Tri', 'Hau Giang', 'Ca Mau', 'Gia Lai', 'Lam Dong', 'Bac Lieu', 'Bac Kan', 'Cao Bang', 'Dien Bien', 'Dong Thap', 'Ha Giang', 'Lai Chau', 'Lao Cai', 'Quang Binh', 'Son La']
Column 'Number of new projects' have values not numeric:
[' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ']
Column 'Newly registered capital (million USD)' have values not numeric:
['2,313.95', '3,159.40', '1,356.46', '2,584.86', '1,111.25', '2,134.30', '1,002.38', '1,342.30', ' -   ', '5,041.05', '1,216.58', '1,803.51', '1,382.06', '1,841.35', '1,546.59', '1,296.70', '4,000.00', '1,064.13', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', '3,518.84', '1,170.51', '1,316.82', '1,011.55', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', '1,909.08', '2,181.17', '1,139.00', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ']
Column 'Adjusted project number' have values not numeric:
[' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ']
Column 'Adjusted capital (million USD)' have values not numeric:
['1,000.11', '2,888.34', '1,117.00', '1,829.64', '1,140.00', ' -   ', '1,261.91', '1,489.66', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', '2,727.59', '1,124.28', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', '1,600.72', ' -   ', '1,685.63', '1,212.16', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ', ' -   ']
Column 'Number of times of capital contribution to buy shares' have values not numeric:
['2,788', '3,710', '1,351', '5,720', '3,640', ' -   ', ' -   ', '2,289', ' -   ', ' -   ', ' -   ', ' -   ', '2,411', ' -   ', ' -   ']
Column 'Value of capital contribution, share purchase
(million USD)' have values not numeric:
['3,191.90', '1,703.13', '4,993.11', '6,472.60', '5,595.33', '1,009.55', '3,177.38', ' -   ', '1,611.82', ' -   ', '2,217.58', '1,927.21', ' -   ', ' -   ', ' -   ', ' -   ', '1,738.61', '1,196.48', ' -   ', ' -   ']
In [ ]:
## Drop comma value
n_df = remove_commas_and_convert(n_df)
In [ ]:
# Drop ' - ' value
### Drop ' - ' value column 'Number of new projects'
n_df['Number of new projects'] = n_df['Number of new projects'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
### Drop ' - ' value column 'Newly registered capital (million USD)'
n_df['Newly registered capital (million USD)'] = n_df['Newly registered capital (million USD)'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
### Drop ' - ' value column 'Adjusted project number'
n_df['Adjusted project number'] = n_df['Adjusted project number'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
### Drop ' - ' value column 'Adjusted capital (million USD)'
n_df['Adjusted capital (million USD)'] = n_df['Adjusted capital (million USD)'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
### Drop ' - ' value column 'Number of times of capital contribution to buy shares'
n_df['Number of times of capital contribution to buy shares'] = n_df['Number of times of capital contribution to buy shares'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
### Drop ' - ' value column 'Value of capital contribution, share purchase\n(million USD)'
n_df['Value of capital contribution, share purchase\n(million USD)'] = n_df['Value of capital contribution, share purchase\n(million USD)'].replace(to_replace=r'[^0-9.]', value=0, regex=True)
In [ ]:
## Check for not numeric value
non_numeric_dict = find_non_numeric_values(n_df)
if non_numeric_dict:
    for col, values in non_numeric_dict.items():
        print(f"Column '{col}' have values not numeric:")
        print(values)
else:
    print("No non-numeric values found.")
Column 'Provinces' have values not numeric:
['TP. Ho Chi Minh', 'Hai Phong', 'Ha Noi', 'Binh Duong', 'Dong Nai', 'Bac Giang', 'Bac Ninh', 'Long An', 'Ha Nam', 'Tay Ninh', 'Phu Yen', 'Quang Ninh', 'Ba Ria - Vung Tau', 'Hai Duong', 'Tien Giang', 'Hung Yen', 'Ha Tinh', 'Vinh Phuc', 'Nam Dinh', 'Tra Vinh', 'Can Tho', 'Thanh Hoa', 'Phu Tho', 'Thai Nguyen', 'Vinh Long', 'Quang Nam', 'Binh Phuoc', 'Da Nang', 'Ninh Binh', 'Ninh Thuan', 'Binh Dinh', 'Nghe An', 'Hau Giang', 'Khanh Hoa', 'Thai Binh', 'Tuyen Quang', 'Quang Binh', 'Lam Dong', 'Ben Tre', 'Ca Mau', 'Thua Thien Hue', 'Lao Cai', 'Quang Ngai', 'Dong Thap', 'Ha Giang', 'An Giang', 'Hoa Binh', 'Lang Son', 'Binh Thuan', 'Kon Tum', 'Soc Trang', 'Kien Giang', 'Quang Tri', 'Yen Bai', 'Dak Lak', 'Dak Nong', 'Gia Lai', 'Bac Kan', 'Bac Lieu', 'Dien Bien', 'Cao Bang', 'Lai Chau', 'Son La', 'TP. Ho Chi Minh', 'Bac Ninh', 'Thanh Hoa', 'Binh Duong', 'Khanh Hoa', 'Ha Noi', 'Nam Dinh', 'Dong Nai', 'Kien Giang', 'Tay Ninh', 'Hai Phong', 'Bac Giang', 'Ba Ria - Vung Tau', 'Hung Yen', 'Binh Phuoc', 'Long An', 'Quang Ngai', 'Hai Duong', 'Ninh Thuan', 'Ha Nam', 'Yen Bai', 'Ben Tre', 'Ninh Binh', 'Phu Tho', 'Quang Binh', 'Vinh Phuc', 'Binh Dinh', 'Tien Giang', 'Tra Vinh', 'Da Nang', 'Quang Nam', 'Vinh Long', 'Nghe An', 'Thai Nguyen', 'Thai Binh', 'Ha Tinh', 'Dong Thap', 'Lam Dong', 'Dak Lak', 'Quang Ninh', 'Hoa Binh', 'Binh Thuan', 'Can Tho', 'Dak Nong', 'Ca Mau', 'Soc Trang', 'Lao Cai', 'Son La', 'Cao Bang', 'An Giang', 'Thua Thien Hue', 'Dien Bien', 'Ha Giang', 'Quang Tri', 'Lang Son', 'Tuyen Quang', 'Phu Yen', 'Kon Tum', 'Hau Giang', 'Bac Lieu', 'Bac Kan', 'Gia Lai', 'Lai Chau', 'Ha Noi', 'TP. Ho Chi Minh', 'Hai Phong', 'Binh Duong', 'Ba Ria - Vung Tau', 'Dong Nai', 'Thua Thien Hue', 'Bac Ninh', 'Tay Ninh', 'Long An', 'Hai Duong', 'Bac Giang', 'Binh Phuoc', 'Hung Yen', 'Quang Nam', 'Thai Nguyen', 'Ha Nam', 'Ninh Thuan', 'Quang Ninh', 'Ben Tre', 'Vinh Phuc', 'Bac Lieu', 'Quang Ngai', 'Thanh Hoa', 'Kien Giang', 'Da Nang', 'Nam Dinh', 'Tien Giang', 'Hoa Binh', 'Ninh Binh', 'Vinh Long', 'Phu Tho', 'Binh Dinh', 'Tra Vinh', 'Ha Tinh', 'Khanh Hoa', 'Soc Trang', 'Thai Binh', 'Dak Nong', 'Ca Mau', 'Can Tho', 'Quang Binh', 'Dak Lak', 'Tuyen Quang', 'Nghe An', 'Binh Thuan', 'Phu Yen', 'Lang Son', 'Kon Tum', 'Lam Dong', 'Yen Bai', 'Dong Thap', 'Hau Giang', 'An Giang', 'Son La', 'Lao Cai', 'Quang Tri', 'Ha Giang', 'Bac Kan', 'Cao Bang', 'Dien Bien', 'Lai Chau', 'Gia Lai', 'Ha Noi', 'TP. Ho Chi Minh', 'Binh Duong', 'Dong Nai', 'Bac Ninh', 'Hai Phong', 'Tay Ninh', 'Bac Giang', 'Ba Ria - Vung Tau', 'Ha Nam', 'Long An', 'Hai Duong', 'Thai Nguyen', 'Vinh Phuc', 'Da Nang', 'Hung Yen', 'Binh Phuoc', 'Tien Giang', 'Thanh Hoa', 'Phu Tho', 'Thua Thien Hue', 'Nghe An', 'Quang Ninh', 'Phu Yen', 'Khanh Hoa', 'Quang Nam', 'Binh Thuan', 'Vinh Long', 'Ninh Binh', 'Quang Ngai', 'Ninh Thuan', 'Bac Lieu', 'Soc Trang', 'Tra Vinh', 'Binh Dinh', 'Ca Mau', 'Hau Giang', 'Can Tho', 'Thai Binh', 'An Giang', 'Nam Dinh', 'Ben Tre', 'Ha Tinh', 'Lam Dong', 'Kien Giang', 'Quang Tri', 'Tuyen Quang', 'Dong Thap', 'Yen Bai', 'Kon Tum', 'Bac Kan', 'Dak Lak', 'Lang Son', 'Dien Bien', 'Lao Cai', 'Quang Binh', 'Ha Giang', 'Son La', 'Dak Nong', 'Cao Bang', 'Lai Chau', 'Hoa Binh', 'Gia Lai', 'TP. Ho Chi Minh', 'Bac Lieu', 'Ha Noi', 'Ba Ria - Vung Tau', 'Binh Duong', 'Hai Phong', 'Dong Nai', 'Bac Ninh', 'Bac Giang', 'Long An', 'Ha Nam', 'Ben Tre', 'Tay Ninh', 'Vinh Phuc', 'Hai Duong', 'Quang Ninh', 'Hung Yen', 'Thai Nguyen', 'Binh Phuoc', 'Thanh Hoa', 'Phu Tho', 'Quang Binh', 'Vinh Long', 'Da Nang', 'Nghe An', 'Tien Giang', 'Dak Nong', 'Quang Ngai', 'Tra Vinh', 'Ninh Binh', 'Binh Thuan', 'Nam Dinh', 'Thai Binh', 'Khanh Hoa', 'Soc Trang', 'Binh Dinh', 'Can Tho', 'Thua Thien Hue', 'Ca Mau', 'Hoa Binh', 'Quang Tri', 'Lam Dong', 'Dong Thap', 'Ha Tinh', 'Kien Giang', 'Tuyen Quang', 'Gia Lai', 'Yen Bai', 'Lao Cai', 'An Giang', 'Hau Giang', 'Dak Lak', 'Son La', 'Phu Yen', 'Kon Tum', 'Dien Bien', 'Cao Bang', 'Quang Nam', 'Ninh Thuan', 'Bac Kan', 'Ha Giang', 'Lai Chau', 'Lao Cai', 'Hai Phong', 'Long An', 'TP. Ho Chi Minh', 'Binh Duong', 'Bac Ninh', 'Ha Noi', 'Dong Nai', 'Can Tho', 'Bac Giang', 'Quang Ninh', 'Tay Ninh', 'Vinh Phuc', 'Hung Yen', 'Phu Tho', 'Thai Binh', 'Dak Lak', 'Binh Phuoc', 'Ba Ria - Vung Tau', 'Hai Duong', 'Nghe An', 'Ha Nam', 'Thai Nguyen', 'Thanh Hoa', 'Thua Thien Hue', 'Da Nang', 'Ninh Binh', 'Hau Giang', 'Kon Tum', 'Nam Dinh', 'Tien Giang', 'Binh Dinh', 'Quang Tri', 'Ninh Thuan', 'Quang Binh', 'Yen Bai', 'Vinh Long', 'Dong Thap', 'Quang Ngai', 'Quang Nam', 'Khanh Hoa', 'Ca Mau', 'Binh Thuan', 'Lam Dong', 'Tra Vinh', 'An Giang', 'Ha Tinh', 'Lao Cai', 'Dak Nong', 'Kien Giang', 'Lang Son', 'Gia Lai', 'Phu Yen', 'Cao Bang', 'Lai Chau', 'Soc Trang', 'Bac Lieu', 'Hoa Binh', 'Tuyen Quang', 'Ben Tre', 'Bac Kan', 'Dien Bien', 'Ha Giang', 'Son La', 'TP. Ho Chi Minh', 'Binh Duong', 'Quang Ninh', 'Bac Ninh', 'Hai Phong', 'Ha Noi', 'Thai Nguyen', 'Dong Nai', 'Bac Giang', 'Ba Ria - Vung Tau', 'Nghe An', 'Long An', 'Hung Yen', 'Phu Tho', 'Tay Ninh', 'Ha Nam', 'Hai Duong', 'Thai Binh', 'Ha Tinh', 'Vinh Phuc', 'Binh Phuoc', 'Tien Giang', 'Thua Thien Hue', 'Can Tho', 'Vinh Long', 'Da Nang', 'Soc Trang', 'Thanh Hoa', 'Ninh Thuan', 'Quang Ngai', 'Quang Nam', 'Ninh Binh', 'Nam Dinh', 'Binh Dinh', 'Binh Thuan', 'An Giang', 'Dak Lak', 'Yen Bai', 'Khanh Hoa', 'Kien Giang', 'Tra Vinh', 'Ben Tre', 'Phu Yen', 'Lang Son', 'Hoa Binh', 'Tuyen Quang', 'Dak Nong', 'Kon Tum', 'Quang Tri', 'Hau Giang', 'Ca Mau', 'Gia Lai', 'Lam Dong', 'Bac Lieu', 'Bac Kan', 'Cao Bang', 'Dien Bien', 'Dong Thap', 'Ha Giang', 'Lai Chau', 'Lao Cai', 'Quang Binh', 'Son La']
In [ ]:
## Drop missing value fill with 0
### Number of new projects
n_df['Number of new projects'] = n_df['Number of new projects'].fillna(0)
### Newly registered capital (million USD)
n_df['Newly registered capital (million USD)'] = n_df['Newly registered capital (million USD)'].fillna(0)
### Adjusted project number   
n_df['Adjusted project number'] = n_df['Adjusted project number'].fillna(0)
### Adjusted capital (million USD)
n_df['Adjusted capital (million USD)'] = n_df['Adjusted capital (million USD)'].fillna(0)
### Number of times of capital contribution to buy shares       
n_df['Number of times of capital contribution to buy shares'] = n_df['Number of times of capital contribution to buy shares'].fillna(0)
### Value of capital contribution, share purchase\n(million USD)
n_df['Value of capital contribution, share purchase\n(million USD)'] = n_df['Value of capital contribution, share purchase\n(million USD)'].fillna(0)
In [ ]:
n_df.sample(n=10)
Out[ ]:
Provinces Number of new projects Newly registered capital (million USD) Adjusted project number Adjusted capital (million USD) Number of times of capital contribution to buy shares Value of capital contribution, share purchase\n(million USD) Year
260 Bac Giang 35 395.3 55 432.49 39 66.9 2020
362 Dak Nong 1 7.65 0 0 0 0 2021
240 Dak Lak 0 0 0 0 2 2.56 2019
290 Ca Mau 1 40.77 0 0 1 0.04 2020
89 Binh Dinh 9 117.22 4 31.7 7 1.25 2017
94 Vinh Long 3 55.31 5 72.41 0 0 2017
310 Ninh Thuan 0 0 2 0 9 23.22 2020
273 Quang Binh 3 295.11 1 0 2 0.12 2020
54 Dak Lak 1 0.23 0 0 0 0 2016
137 Bac Giang 67 183.78 36 318.89 44 40.32 2018
In [ ]:
## Data consistency
cols_to_convert = n_df.columns.drop(['Provinces', 'Year'])
n_df[cols_to_convert] = n_df[cols_to_convert].apply(pd.to_numeric, errors='coerce')
## Check for missing value
print(n_df.isnull().values.any())
print(n_df.isna().sum())
False
Provinces                                                       0
Number of new projects                                          0
Newly registered capital (million USD)                          0
Adjusted project number                                         0
Adjusted capital (million USD)                                  0
Number of times of capital contribution to buy shares           0
Value of capital contribution, share purchase\n(million USD)    0
Year                                                            0
dtype: int64

Visualiztion¶

In [ ]:
# Create a new column for the total FDI
n_df['Total FDI'] = n_df['Newly registered capital (million USD)'] + n_df['Adjusted capital (million USD)'] + n_df['Value of capital contribution, share purchase\n(million USD)']
# Merge data with geodata
fullData = data.merge(
    n_df,
    left_on=['Name'],  # identifier from geodataframe
    right_on=['Provinces']  # identifier from dataframe
)

Plot map chart of dataset¶

In [ ]:
years = fullData['Year'].unique()
fullData['Total FDI'] = pd.to_numeric(fullData['Total FDI'], errors='coerce')
for year in years:
    # Filter data for each year
    data_year = fullData[fullData['Year'] == year]
    # Plot with geoplot for each year
    plt.figure(figsize=(12, 8))
    geoplot.choropleth(
        data_year,
        projection=gcrs.AlbersEqualArea(),
        hue="Total FDI",
        cmap='Greens',
        linewidth=0.1,
        edgecolor='black',
        legend=True,
        figsize=(12, 8)
    )
    plt.title(f"Number of New Projects in {year}")
    plt.show()
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image
<Figure size 1200x800 with 0 Axes>
No description has been provided for this image

Note: From the chart above, we can see that foreign investment in Vietnam is concentrated in major cities such as Ho Chi Minh City, Hanoi, Hai Phong,etc. However, a positive sign is that there is also investment spread across various provinces.

Detail with buble chart¶

In [ ]:
years = n_df['Year'].unique()
for year in years:
    # Filter data for each year
    data_year = n_df[n_df['Year'] == year]
    # Plot with Plotly Express
    fig = px.scatter(data_year, x='Provinces', y='Total FDI', size='Total FDI',color='Provinces',
                     title=f'Total FDI by Provinces in {year}',
                     labels={'Total FDI': 'Total Investment (Million USD)', 'Provinces': 'Provinces'},
                     size_max=60)
    fig.update_layout(yaxis_title='Total Investment (Million USD)',
                      xaxis_title='Provinces',
                      title=f'Total FDI by Provinces in {year}')
    fig.show()

Top 10 provinces with the largest total FDI¶

In [ ]:
years = n_df['Year'].unique()
# Loop through each year and plot the total investment by industry
for year in years:
    # Filter data by year
    df_year = n_df[n_df['Year'] == year]
    # Sort the data by Total FDI
    df_year.sort_values('Total FDI', ascending=False, inplace=True)
    # Select the top 10 provinces with the highest Total FDI
    df_year_top10 = df_year.head(10)
    # Set axis values
    x = df_year_top10['Provinces'].values
    y = df_year_top10['Total FDI'].values
    # Plot
    plt.figure(figsize=(10, 6))
    plt.barh(x, y, color='Green')
    plt.xlabel('Total Investment (Million USD)')
    plt.ylabel('Provinces')
    plt.title(f'Total Investment by Provinces in {year}')
    plt.gca().invert_yaxis() 
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

From the ranking plot and multiple barh , we can have below observations:

  • There is still volatility and changes in the rankings: this indicates the diversity of investment sectors, opportunities, and potential existing across different provinces in Vietnam.
  • There are new names appearing in some years: this suggests that the development is on a significant upward trend and continues to attract investors.